{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "738a9c28",
   "metadata": {},
   "source": [
    "# Import taxonomy dataset into Neo4j"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f5c21d72",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from graphdatascience import GraphDataScience\n",
    "\n",
    "host = \"bolt://44.193.28.203:7687\"\n",
    "user = \"neo4j\"\n",
    "password = \"combatants-coordinates-tugs\"\n",
    "gds = GraphDataScience(host, auth=(user, password))\n",
    "\n",
    "def batch_import(query, params_df, batch_size=25_000):\n",
    "    params = params_df.to_dict('records')\n",
    "    for i in range(0, len(params), batch_size):\n",
    "        batch_data = params[i:i+batch_size]\n",
    "        gds.run_cypher(query, {'data': batch_data})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "40cb2623",
   "metadata": {},
   "source": [
    "Download the [new taxonomy dump files](https://ncbiinsights.ncbi.nlm.nih.gov/2018/02/22/new-taxonomy-files-available-with-lineage-type-and-host-information/) and move them in the same folder where this Jupyter notebook is located \n",
    "\n",
    "*Make sure to install the APOC plugin in Neo4j*"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe2ff031",
   "metadata": {},
   "source": [
    "# Import nodes\n",
    "\n",
    "\ttax_id\t\t\t\t\t          -- node id in GenBank taxonomy database\n",
    " \tparent tax_id\t\t\t\t      -- parent node id in GenBank taxonomy database\n",
    " \trank\t\t\t\t\t          -- rank of this node (superkingdom, kingdom, ...) \n",
    " \tembl code\t\t\t\t          -- locus-name prefix; not unique\n",
    " \tdivision id\t\t\t\t          -- see division.dmp file\n",
    " \tinherited div flag  (1 or 0)      -- 1 if node inherits division from parent\n",
    " \tgenetic code id\t\t\t\t      -- see gencode.dmp file\n",
    " \tinherited GC  flag  (1 or 0)      -- 1 if node inherits genetic code from parent\n",
    " \tmitochondrial genetic code id     -- see gencode.dmp file\n",
    " \tinherited MGC flag  (1 or 0)      -- 1 if node inherits mitochondrial gencode from parent\n",
    " \tGenBank hidden flag (1 or 0)      -- 1 if name is suppressed in GenBank entry lineage\n",
    " \thidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet\n",
    " \tcomments\t\t\t\t          -- free-text comments and citations\n",
    "    plastid genetic code id           -- see gencode.dmp file\n",
    "    inherited PGC flag  (1 or 0)      -- 1 if node inherits plastid gencode from parent\n",
    "\tspecified_species\t\t\t      -- 1 if species in the node's lineage has formal name\n",
    "    hydrogenosome genetic code id     -- see gencode.dmp file\n",
    "    inherited HGC flag  (1 or 0)      -- 1 if node inherits hydrogenosome gencode from parent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4ff66995",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tomaz/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3437: DtypeWarning: Columns (12) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "nodes = pd.read_csv('nodes.dmp', delimiter=\"|\", header=None, quotechar='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f30e7a3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "node_columns = [\"tax_id\", \"parent tax_id\", \"rank\", \"embl code\", \"division id\", \"inherited div flag\",\n",
    "                \"genetic code id\", \"inherited GC flag\", \"mitochondrial genetic code id\", \"inherited MGC flag\",\n",
    "                \"GenBank hidden flag\", \"hidden subtree root flag\", \"comments\", \"plastid genetic code id\",\n",
    "                \"inherited PGC flag\", \"specified_species\", \"hydrogenosome genetic code id\", \"inherited HGC flag\", \n",
    "                \"None\"]\n",
    "nodes.columns = node_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b71de37f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tax_id</th>\n",
       "      <th>parent tax_id</th>\n",
       "      <th>rank</th>\n",
       "      <th>embl code</th>\n",
       "      <th>division id</th>\n",
       "      <th>inherited div flag</th>\n",
       "      <th>genetic code id</th>\n",
       "      <th>inherited GC flag</th>\n",
       "      <th>mitochondrial genetic code id</th>\n",
       "      <th>inherited MGC flag</th>\n",
       "      <th>GenBank hidden flag</th>\n",
       "      <th>hidden subtree root flag</th>\n",
       "      <th>comments</th>\n",
       "      <th>plastid genetic code id</th>\n",
       "      <th>inherited PGC flag</th>\n",
       "      <th>specified_species</th>\n",
       "      <th>hydrogenosome genetic code id</th>\n",
       "      <th>inherited HGC flag</th>\n",
       "      <th>None</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>no rank</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>131567</td>\n",
       "      <td>superkingdom</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>335928</td>\n",
       "      <td>genus</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>species</td>\n",
       "      <td>AC</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9</td>\n",
       "      <td>32199</td>\n",
       "      <td>species</td>\n",
       "      <td>BA</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   tax_id  parent tax_id          rank embl code  division id  \\\n",
       "0       1              1       no rank       NaN            8   \n",
       "1       2         131567  superkingdom       NaN            0   \n",
       "2       6         335928         genus       NaN            0   \n",
       "3       7              6       species        AC            0   \n",
       "4       9          32199       species        BA            0   \n",
       "\n",
       "   inherited div flag  genetic code id  inherited GC flag  \\\n",
       "0                   0                1                  0   \n",
       "1                   0               11                  0   \n",
       "2                   1               11                  1   \n",
       "3                   1               11                  1   \n",
       "4                   1               11                  1   \n",
       "\n",
       "   mitochondrial genetic code id  inherited MGC flag  GenBank hidden flag  \\\n",
       "0                              0                   0                    0   \n",
       "1                              0                   0                    0   \n",
       "2                              0                   1                    0   \n",
       "3                              0                   1                    1   \n",
       "4                              0                   1                    1   \n",
       "\n",
       "   hidden subtree root flag comments  plastid genetic code id  \\\n",
       "0                         0      NaN                      NaN   \n",
       "1                         0      NaN                      NaN   \n",
       "2                         0      NaN                      NaN   \n",
       "3                         0      NaN                      NaN   \n",
       "4                         0      NaN                      NaN   \n",
       "\n",
       "   inherited PGC flag  specified_species  hydrogenosome genetic code id  \\\n",
       "0                 NaN                  0                            0.0   \n",
       "1                 NaN                  0                            0.0   \n",
       "2                 NaN                  0                            0.0   \n",
       "3                 NaN                  1                            0.0   \n",
       "4                 NaN                  1                            0.0   \n",
       "\n",
       "   inherited HGC flag  None  \n",
       "0                   0   NaN  \n",
       "1                   1   NaN  \n",
       "2                   1   NaN  \n",
       "3                   1   NaN  \n",
       "4                   1   NaN  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nodes.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9208c484",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2425415"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(nodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c4385416",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gds.run_cypher(\"\"\"\n",
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Node) REQUIRE n.id IS UNIQUE;\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1eaa10d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "create_nodes_query = \"\"\"\n",
    "UNWIND $data AS row\n",
    "CREATE (n:Node {id: row.tax_id})\n",
    "SET n += apoc.map.clean(row, ['tax_id', 'parent tax_id', 'rank'], [\"\", gds.util.NaN()])\n",
    "WITH n, row.rank AS rank\n",
    "CALL apoc.create.addLabels(n, [apoc.text.capitalize(rank)]) YIELD node\n",
    "RETURN distinct 'done'\n",
    "\"\"\"\n",
    "\n",
    "batch_import(create_nodes_query, nodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "306736ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "parent_relation_import_query =\"\"\"\n",
    "UNWIND $data AS row\n",
    "MATCH (child:Node {id:row.tax_id})\n",
    "MATCH (parent:Node {id:row.`parent tax_id`})\n",
    "MERGE (child)-[:PARENT]->(parent)\n",
    "\"\"\"\n",
    "\n",
    "batch_import(parent_relation_import_query, nodes[['tax_id', 'parent tax_id']])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "098c19bd",
   "metadata": {},
   "source": [
    "# Import names\n",
    "\n",
    "names.dmp\n",
    "---------\n",
    "Taxonomy names file has these fields:\n",
    "\n",
    "\ttax_id\t\t\t\t\t-- the id of node associated with this name\n",
    "\tname_txt\t\t\t\t-- name itself\n",
    "\tunique name\t\t\t\t-- the unique variant of this name if name not unique\n",
    "\tname class\t\t\t\t-- (synonym, common name, ...)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "09d859e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "names = pd.read_csv('names.dmp', delimiter=\"|\", header=None, quotechar='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4b3a2a8b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tax_id</th>\n",
       "      <th>name_txt</th>\n",
       "      <th>unique_name</th>\n",
       "      <th>name_class</th>\n",
       "      <th>none</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>all</td>\n",
       "      <td>NaN</td>\n",
       "      <td>synonym</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>root</td>\n",
       "      <td>NaN</td>\n",
       "      <td>scientific name</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Bacteria</td>\n",
       "      <td>Bacteria &lt;bacteria&gt;</td>\n",
       "      <td>scientific name</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>bacteria</td>\n",
       "      <td>NaN</td>\n",
       "      <td>blast name</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>eubacteria</td>\n",
       "      <td>NaN</td>\n",
       "      <td>genbank common name</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   tax_id    name_txt          unique_name           name_class  none\n",
       "0       1         all                  NaN              synonym   NaN\n",
       "1       1        root                  NaN      scientific name   NaN\n",
       "2       2    Bacteria  Bacteria <bacteria>      scientific name   NaN\n",
       "3       2    bacteria                  NaN           blast name   NaN\n",
       "4       2  eubacteria                  NaN  genbank common name   NaN"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "name_columns = ['tax_id', 'name_txt', 'unique_name', 'name_class', 'none']\n",
    "names.columns = name_columns\n",
    "names.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "90eadba6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import_names_query = \"\"\"\n",
    "UNWIND $data AS row\n",
    "MATCH (n:Node {id: row.tax_id})\n",
    "SET n.name = row.name_txt\n",
    "\"\"\"\n",
    "\n",
    "batch_import(import_names_query, names[['tax_id', 'name_txt']])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09a70b7e",
   "metadata": {},
   "source": [
    "# Import hosts \n",
    "\n",
    "host.dmp\n",
    "--------\n",
    "Theoretical host for organism file fields:\n",
    "\n",
    "\ttax_id\t\t\t\t\t-- node id\n",
    "\tpotential_hosts\t\t\t\t-- theoretical host list separated by comma ','"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f6fa263c",
   "metadata": {},
   "outputs": [],
   "source": [
    "hosts = pd.read_csv('host.dmp', delimiter=\"|\", header=None, quotechar='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "62e8f61a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tax_id</th>\n",
       "      <th>potential_hosts</th>\n",
       "      <th>none</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>562</td>\n",
       "      <td>bacteria,vertebrates</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>666</td>\n",
       "      <td>bacteria</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>686</td>\n",
       "      <td>bacteria</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1280</td>\n",
       "      <td>human</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1307</td>\n",
       "      <td>bacteria</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   tax_id       potential_hosts  none\n",
       "0     562  bacteria,vertebrates   NaN\n",
       "1     666              bacteria   NaN\n",
       "2     686              bacteria   NaN\n",
       "3    1280                 human   NaN\n",
       "4    1307              bacteria   NaN"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hosts.columns = ['tax_id', 'potential_hosts', 'none']\n",
    "hosts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e39a27bd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gds.run_cypher(\"\"\"\n",
    "CREATE INDEX IF NOT EXISTS FOR (n:Node) ON (n.name)\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a5a0563",
   "metadata": {},
   "outputs": [],
   "source": [
    "import_hosts_query = \"\"\"\n",
    "UNWIND $data AS row\n",
    "MATCH (n:Node {id: row.tax_id})\n",
    "UNWIND split(row.potential_hosts, ',') AS host\n",
    "MATCH (h:Node {name: host})\n",
    "MERGE (n)-[:POTENTIAL_HOST]->(h)\n",
    "\"\"\"\n",
    "batch_import(import_hosts_query, hosts)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2a1fe84",
   "metadata": {},
   "source": [
    "# import citations\n",
    "\n",
    "citations.dmp\n",
    "-------------\n",
    "Citations file fields:\n",
    "\n",
    "\tcit_id\t\t\t\t-- the unique id of citation\n",
    "\tcit_key\t\t\t\t-- citation key\n",
    "    medline_id          -- unique id in MedLine database (0 if not in MedLine)\n",
    "\tpubmed_id\t\t    -- unique id in PubMed database (0 if not in PubMed)\n",
    "\turl\t\t\t\t\t-- URL associated with citation\n",
    "\ttext\t\t\t\t-- any text (usually article name and authors)\n",
    "\t\t\t\t\t\t-- The following characters are escaped in this text by a backslash:\n",
    "\t\t\t\t\t\t-- newline (appear as \"\\n\"),\n",
    "\t\t\t\t\t\t-- tab character (\"\\t\"),\n",
    "\t\t\t\t\t\t-- double quotes ('\\\"'),\n",
    "\t\t\t\t\t\t-- backslash character (\"\\\\\").\n",
    "\ttaxid_list\t\t\t-- list of node ids separated by a single space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2969c064",
   "metadata": {},
   "outputs": [],
   "source": [
    "citations = pd.read_csv('citations.dmp', delimiter=\"|\", header=None, quotechar='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06348067",
   "metadata": {},
   "outputs": [],
   "source": [
    "citations.columns = ['cit_id', 'cit_key', 'medline_id', 'pubmed_id', 'url', 'text', 'taxid_list', 'none']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac94ee50",
   "metadata": {},
   "outputs": [],
   "source": [
    "citations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "930a0f6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "gds.run_cypher('CREATE CONSTRAINT FOR (c:Citation) REQUIRE c.id IS UNIQUE;')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e348e184",
   "metadata": {},
   "outputs": [],
   "source": [
    "import_citations_query = \"\"\"\n",
    "UNWIND $data AS row\n",
    "MERGE (c:Citation {id: row.cit_id})\n",
    "SET c += apoc.map.clean(row, ['cit_id', 'none', 'taxid_list'], ['0', gds.util.NaN()])\n",
    "WITH c, row.taxid_list AS taxid_list\n",
    "WHERE toString(taxid_list) <> \"NaN\"\n",
    "UNWIND split(toString(taxid_list), ' ') AS tax_id\n",
    "WITH c, tax_id\n",
    "MATCH (n:Node {id: toInteger(tax_id)})\n",
    "MERGE (c)-[:MENTIONS]->(n);\n",
    "\"\"\"\n",
    "\n",
    "batch_import(import_citations_query, citations, 5000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c47e5767",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
